{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型验证和调参"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集&特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import os\n",
    "from sklearn import metrics\n",
    "# from sklearn.cross_validation import train_test_split\n",
    "from sklearn import preprocessing\n",
    "import pickle\n",
    "# 获取当前的工作目录\n",
    "pwd = os.getcwd()\n",
    "# 将工作目录更改到训练集\n",
    "os.chdir(\"处理后训练集\")\n",
    "# ——————————————————————————读取训练数据—————————————————————————————— #\n",
    "# 航班数据\n",
    "with open('colums_type.pickle','rb') as f:\n",
    "    colums_types = pickle.load(f)\n",
    "train_data = pd.read_table('train_data1.csv',sep=',',encoding='gb2312',dtype=colums_types)\n",
    "os.chdir(pwd)\n",
    "# 改回原来工作目录\n",
    "\n",
    "# 填充缺失值\n",
    "train_data['到达天气'] = train_data['到达天气'].astype('object')\n",
    "train_data['到达天气']=train_data['到达天气'].fillna('0')\n",
    "train_data['出发天气'] = train_data['出发天气'].astype('object')\n",
    "train_data['出发天气']=train_data['出发天气'].fillna('0')\n",
    "train_data['前序延误']=train_data['前序延误'].fillna(train_data['前序延误'].mean()) #均值填充\n",
    "train_data['出发气温'] = train_data['出发气温'].astype('object')\n",
    "train_data['出发气温']=train_data['出发气温'].fillna('一般')\n",
    "train_data['到达气温'] = train_data['到达气温'].astype('object')\n",
    "train_data['到达气温']=train_data['到达气温'].fillna('一般')\n",
    "train_data['起飞间隔'] = train_data['起飞间隔'].fillna(train_data['起飞间隔'].mean())#均值填充\n",
    "# train_data = train_data.dropna()\n",
    "# 分类变量\n",
    "targets = np.where(train_data['起飞延误时间']>3,1,0)\n",
    "\n",
    "del(train_data['起飞延误时间'])\n",
    "\n",
    "# ——————————————————————————读取测试数据—————————————————————————————— #\n",
    "# 工作目录更改到测试集以读取数据\n",
    "os.chdir(\"处理后测试集\")\n",
    "# 航班数据\n",
    "test_data = pd.read_table('test_data1.csv',sep=',',encoding='gb2312')\n",
    "# 改回工作目录\n",
    "os.chdir(pwd)\n",
    "train_columns = train_data.columns\n",
    "test_data = test_data[train_columns]\n",
    "# 填充缺失值\n",
    "test_data['到达天气'] = test_data['到达天气'].astype('object')\n",
    "test_data['到达天气']=test_data['到达天气'].fillna('0')\n",
    "test_data['出发天气'] = test_data['出发天气'].astype('object')\n",
    "test_data['出发天气']=test_data['出发天气'].fillna('0')\n",
    "test_data['前序延误']=test_data['前序延误'].fillna(test_data['前序延误'].mean()) #均值填充\n",
    "test_data['出发气温'] = test_data['出发气温'].astype('object')\n",
    "test_data['出发气温']=test_data['出发气温'].fillna('一般')\n",
    "test_data['到达气温'] = test_data['到达气温'].astype('object')\n",
    "test_data['到达气温']=test_data['到达气温'].fillna('一般')\n",
    "test_data['起飞间隔'] = test_data['起飞间隔'].fillna(train_data['起飞间隔'].mean()) #均值填充\n",
    "\n",
    "# 编码\n",
    "# 对字符型数据编码\n",
    "columns = ['出发机场', '到达机场', '出发天气', '到达天气','航班编号','出发气温','到达气温','航空公司']\n",
    "\n",
    "le = preprocessing.LabelEncoder()\n",
    "for col in columns :\n",
    "    train_data[col] = le.fit_transform(train_data[col])\n",
    "#     未见过的字符加进去\n",
    "    clas = list(le.classes_)\n",
    "    clas = clas + list(test_data[col])\n",
    "    le.classes_ = np.array(np.unique(clas))\n",
    "    \n",
    "    test_data[col] = le.transform(test_data[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "#  划分样本集\n",
    "train_x,test_x,train_y,test_y = train_test_split(train_data,targets,test_size=0.25,random_state=66)\n",
    "#---------------------------------------------------网格搜索参数-------------------------------------#\n",
    "param = {'n_estimators':[i for i in range(200,600,100)],\n",
    "        'learning_rate':[i/100 for i in range(5,15,2)]}\n",
    "grid_search = GridSearchCV(GradientBoostingClassifier(loss = 'exponential',max_depth=4,min_samples_split=10,\n",
    "                        min_weight_fraction_leaf=0.01,subsample=0.9),\n",
    "                    cv=5,param_grid=param,n_jobs=-1,scoring='roc_auc')\n",
    "grid_search.fit(train_x,train_y)\n",
    "#-----------------------------------------------------输出交叉验证的结果------------------------------#\n",
    "print(\"Best parameters set found on development set:\")\n",
    "print()\n",
    "print(grid_search.best_params_)\n",
    "print('grid_scores_:')\n",
    "print(grid_search.grid_scores_)\n",
    "print('best_score_')\n",
    "print(grid_search.best_score_)\n",
    "\n",
    "# 预测测试集的结果\n",
    "pre_y = grid_search.predict_proba(test_x)[:,1]\n",
    "# 计算auc\n",
    "fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)\n",
    "auc=metrics.auc(fpr, tpr)\n",
    "print(\"AUC得分为：\")\n",
    "print(auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
